In this competition, we have to find whether the user will repeat the song within the next one month.
Loading the required Packages
library(tidyverse)
library(feather)
library(data.table)
library(viridis)
library(DT)
library(lubridate)
library(magrittr)
options(tibble.print_max = 5, tibble.print_min = 5)Let’s start with EDA on individual data frame and then proceed to their interactions,
We will see how each variable in the train DF affects the target.
Reading in the train data set,
source_system_tab, source_screen_name and source_type are categorical.
Let’s see how these variables affect the target.
Defining useful functions ,
## ggplot setting for readable labels
readable_labs <- theme(axis.text=element_text(size=12),
axis.title=element_text(size=14),
plot.title = element_text(hjust = 0.5))
# Function to dislpay count of each category of the column and plot how it affects target
target_vs_column <-function(df, col_name, x , y, title)
{
temp_df <- df %>%
group_by_(col_name) %>%
summarize(count = n(), mean_target = mean(target)) %>%
arrange(desc(mean_target))
df_plot <- temp_df %>%
ggplot(aes_string(col_name, "mean_target")) +
geom_col(aes(fill=count)) +
scale_fill_gradient(low='turquoise', high = 'violet')+
coord_flip() +
labs(x = x,
y = y,
title= title) +
readable_labs
print(df_plot)
return (temp_df)
}
# Function to group songs and user by count and check it agains mean_target
target_vs_colcount <- function(df, col_name, x, y, title)
{
df %>%
group_by_(col_name) %>%
summarize(count = n(), mean_target = mean(target)) %>%
group_by(count) %>%
summarize(new_count = n(), avg_target = mean(mean_target)) %>%
rename(no_of_items = new_count, occurence = count) %>%
print %>%
arrange(desc(avg_target)) %>%
print %>%
ggplot(aes(occurence, avg_target)) +
geom_line(color='turquoise') +
geom_smooth(color='turquoise') +
labs(x = x,
y = y,
title= title) +
readable_labs
}We can see that my library has the most count and setting has the least count in the data set.
It looks like songs are played mostly through my library, search, radio and discover.
Can one really play a song from settings menu? Or the song was initiated (through a playlist?) when the user was in settings menu?
One interesting thing is that, if the song is from my library then it is more likely to be replayed within a month and if it is from radio then it is less likely.
My library is where the user stores their songs locally and hence they really love that song, leading to high mean_target.
On contrary, radio is a random shuffle of songs and hence the user likeability is not predefined leading to low mean_target.
target_vs_column(train, col_name = "source_system_tab",
x = 'Frequency',
y = 'Target',
title = 'Count of source_system_tab vs Target')Similar to source system tab, we can see that screens associated with my library have the most count.
Looks like KKBox users prefer downloaded music than live streaming. Payment (purchasing a single song?) has the highest repeatability but the count of that category is only 12 in the entire data set.
Local songs in general has higher repeatability.
target_vs_column(train, col_name = "source_screen_name",
x = 'Frequency',
y = 'Target',
title = 'Count of source_screen_name vs Target')Songs appearing in local playlist has a slightly more repeatability than local library.
May be the user liked the song so much to include in their local playlist that positively affects repeatability.
target_vs_column(train, col_name = "source_type",
x = 'Frequency',
y = 'Target',
title = 'Count of source_type vs Target')Song id and user id pair are unique in train data set
Songs are grouped together and their count is checked against the target variable.
The count of a song present in the train data set is almost linearly associated with the mean_target.
Assuming the train data set is randomnly drawn from the population, the more the song occurs the more it is discoverable by the user.
This plots shows the relationship between discoverability vs mean_target.
You could see that there are 166766 songs that are appearing only once and has a lower mean_target and a single song that is appearing 13293 time that has a higher mean_target.
target_vs_colcount(train, "song_id", "Song Occurence", "Target", "Song Occurence vs Target")## # A tibble: 1,798 x 3
## occurence no_of_items avg_target
## <int> <int> <dbl>
## 1 1 166766 0.3776309
## 2 2 48444 0.3787363
## 3 3 26319 0.3860202
## 4 4 16789 0.3852671
## 5 5 12023 0.3876071
## # ... with 1,793 more rows
## # A tibble: 1,798 x 3
## occurence no_of_items avg_target
## <int> <int> <dbl>
## 1 2752 1 0.8510174
## 2 13293 1 0.7941022
## 3 13973 1 0.7790024
## 4 6241 1 0.7655824
## 5 12855 1 0.7629716
## # ... with 1,793 more rows
If the user occurs more in the train data set (frequent listener) then it does not mean that they are more probable to repeat, given by the flat trend.
target_vs_colcount(train, "msno", "User Occurence", "Target", "User Occurence vs Target")## # A tibble: 1,564 x 3
## occurence no_of_items avg_target
## <int> <int> <dbl>
## 1 1 932 0.1620172
## 2 2 660 0.1643939
## 3 3 558 0.1296296
## 4 4 435 0.2120690
## 5 5 406 0.1768473
## # ... with 1,559 more rows
## # A tibble: 1,564 x 3
## occurence no_of_items avg_target
## <int> <int> <dbl>
## 1 1109 1 0.9846709
## 2 1013 1 0.8795656
## 3 2091 1 0.8675275
## 4 1742 1 0.8628014
## 5 820 1 0.8548780
## # ... with 1,559 more rows
train %>%
group_by(target) %>%
countLet’s look at the members df,
In members DF, city, bd, gender, registered via are categorical and registration init and expiration date are dates. Useful functions,
members_colgroup <- function(df,col_name, x, y, title, xmin, xmax, ymin, ymax)
{
temp_df <- df %>%
group_by_(col_name) %>%
count() %>%
arrange(desc(n))
df_plot <- temp_df %>%
ggplot(aes_string(col_name, "n")) +
geom_col(fill='goldenrod2') +
labs(x = x,
y = y,
title = title) +
xlim(xmin, xmax) +
ylim(ymin, ymax) +
readable_labs
print(df_plot)
return(temp_df)
}
members_date_count <- function(df, col_name, x, y, title)
{
df %>%
group_by_(month = month(col_name), year = year(col_name)) %>%
count() %>%
ungroup %>%
mutate(date = as.Date(paste(as.character(year), as.character(month), '01', sep='-')))
ggplot(aes(date, n))+
geom_line(color='goldenrod2', size=1) +
labs(x = x,
y = y,
title= title) +
xlim(xmin, xmax) +
readable_labs
}As mentioned in the data dictionary there seems to be outliers in the age field. There are negative values as well as values above 1000.
Sorted bd vs Frequency is shown in the tibble as well as the graph.
There are 19932 records with 0 as age. This could be either outliers or missing values.
Plotting in the age range 1 -100 to show the real distribution.
members_colgroup(members, "bd", "Age", "Frequency", "Age Distribution", 1, 100, 0, 1000)## Warning: Removed 16 rows containing missing values (position_stack).
City 1 seems to be highly dominating. But the number 19445 seems suspicious as it is close to the number of records with zero age.
City1 is also far from other city counts.
members_colgroup(members, "city", "City", "Frequency", "City Distribution", 0, 25, 0, 20000)Male and female are almost equal. We have a lot of missing gender.
members %>%
group_by(gender) %>%
countRegistration methods seem to be dominated mainly by 3,4,7 and 9.
members_colgroup(members, "registered_via", "Registration Method", "Frequency", "Registration method Distribution", 0, 16, 0, 15000)Setting date type,
members %<>%
mutate(registration_init_time = ymd(registration_init_time),
expiration_date = ymd(expiration_date))## Warning in as.POSIXlt.POSIXct(x, tz): unknown timezone 'zone/tz/2017c.1.0/
## zoneinfo/America/Edmonton'
We have members as far as 2005. But mostly we have users who signed up between later part of 2016 and early part of 2017. Almost 1/3 of the members have an expiration date of 9/2017.
#members_date_count(members, "registration_init_time", "Signup Date", "Number of Users", "Signup vs User Count")
reg_count <- members %>%
group_by(month = month(registration_init_time), year = year(registration_init_time)) %>%
count() %>%
ungroup %>%
mutate(date = as.Date(paste(as.character(year), as.character(month), '01', sep='-'))) %>%
arrange(desc(n)) %>%
print## # A tibble: 155 x 4
## month year n date
## <dbl> <dbl> <int> <date>
## 1 1 2017 2573 2017-01-01
## 2 12 2016 2545 2016-12-01
## 3 2 2017 2109 2017-02-01
## 4 11 2016 1432 2016-11-01
## 5 10 2016 978 2016-10-01
## # ... with 150 more rows
exp_count <- members %>%
group_by(month = month(expiration_date), year = year(expiration_date)) %>%
count() %>%
ungroup %>%
mutate(date = as.Date(paste(as.character(year), as.character(month), '01', sep='-'))) %>%
arrange(desc(n)) %>%
print## # A tibble: 138 x 4
## month year n date
## <dbl> <dbl> <int> <date>
## 1 9 2017 10681 2017-09-01
## 2 10 2017 4924 2017-10-01
## 3 12 2016 2407 2016-12-01
## 4 1 2017 2260 2017-01-01
## 5 2 2017 1918 2017-02-01
## # ... with 133 more rows
reg_count %>%
left_join(exp_count, by="date") %>%
ggplot() +
geom_line(aes(date, n.x), color='goldenrod2') +
geom_line(aes(date, n.y), color='mediumorchid') +
labs(y="Frequency", title="Registration and Expiration Distribution")+
readable_labsCity (marked as ‘1’), gender( empty character) and age(marked as 0) seems to be missing values.
While signing up for the app, may be these columns where not mandatory and the co existence of these values should point that
they arised from the same place.
There are 18356 records that match all three condition. There is certainly relationship between these missingness.
Gender and age missingness seems to be even more aggressive.
members %>%
mutate(cga = if_else(((city == 1) & (bd == 0) & (gender == "")), 1, 0),
cg = if_else(((city == 1) & (gender == "")), 1, 0),
ca = if_else(((city == 1) & (bd == 0)), 1, 0),
ga = if_else(((bd == 0) & (gender == "")), 1, 0)) %>%
summarize(city_gender_age = sum(cga),
city_gender = sum(cg),
city_age = sum(ca),
gender_age =sum(ga))Songs DF,
songsLet’s see top 100 frequent items in each category,
top_100 <- function(df, col_name)
{
temp_df <- df %>%
group_by_(col_name) %>%
count %>%
arrange(desc(n)) %>%
print
return(temp_df)
}artist_count <- top_100(songs, "artist_name")## # A tibble: 222,363 x 2
## # Groups: artist_name [222,363]
## artist_name n
## <chr> <int>
## 1 Various Artists 145916
## 2 証聲音樂圖書館 ECHO MUSIC 11276
## 3 Billy Vaughn 4828
## 4 รวมศิลปิน 4432
## 5 Richard Clayderman 4180
## # ... with 2.224e+05 more rows
lyricist_count <- top_100(songs, "lyricist")## # A tibble: 110,579 x 2
## # Groups: lyricist [110,579]
## lyricist n
## <chr> <int>
## 1 1945425
## 2 Traditional 1751
## 3 ― 1530
## 4 林夕 1044
## 5 Michael Ruland 832
## # ... with 1.106e+05 more rows
composer_count <- top_100(songs, "composer")## # A tibble: 329,299 x 2
## # Groups: composer [329,299]
## composer n
## <chr> <int>
## 1 1071350
## 2 Neuromancer 17888
## 3 Johann Sebastian Bach 12105
## 4 Wolfgang Amadeus Mozart 10839
## 5 Marco Rinaldo 10803
## # ... with 3.293e+05 more rows
language_count <- top_100(songs, "language")## # A tibble: 11 x 2
## # Groups: language [11]
## language n
## <dbl> <int>
## 1 52 1336694
## 2 -1 639467
## 3 3 106295
## 4 17 92518
## 5 24 41744
## # ... with 6 more rows
36373 songs have same artist and lyricist name.
1275586 songs have same lyricist and composer name.
144697 songs have same artist and composer name.
36373songs have same artist and lyricist name.
Genre id is a multi label column with a minumum label of 1 to a maximum label of 8.
There are 192 unique genres. There are some missing values as well.
genre_count <- songs %>%
separate(genre_ids, c("one", "two", "three", "four", "five", "six", "seven", "eight"), extra="merge") %>%
select(one:eight)%>%
gather(one:eight, key="nth_id", value="genre_ids", na.rm=TRUE) %>%
group_by(genre_ids) %>%
count %>%
arrange(desc(n)) %>%
print()## # A tibble: 192 x 2
## # Groups: genre_ids [192]
## genre_ids n
## <chr> <int>
## 1 465 589220
## 2 958 182836
## 3 1609 177258
## 4 2022 176531
## 5 2122 149608
## # ... with 187 more rows
Song length range from 0.003 minutes to 202.89 minutes. There are 13623 records that have length more than 15 minutes.
songs %>%
mutate(song_length = song_length/6e4) %>%
ggplot(aes(song_length)) +
geom_histogram(binwidth = 0.25, fill='darkorchid3') +
labs(x='Song Length', y = 'Frequency', title = 'Distribution of song length') +
xlim(0, 15)test <- as.tibble(fread('/Users/kailukowiak/Data_607_Final_Project/test.csv'))##
Read 34.0% of 2556790 rows
Read 81.0% of 2556790 rows
Read 2556790 rows and 6 (of 6) columns from 0.324 GB file in 00:00:04
Let’s compare the test and train data frames.
test_train_plot <- function(train, test, col_name, x, y)
{
test %>%
group_by_(col_name) %>%
summarize(count = n()) %>%
left_join(train %>%
group_by_(col_name) %>%
summarize(count = n()) , by=col_name) %>%
mutate(ratio = count.x/count.y) %>%
rename(test_cnt = count.x, train_cnt = count.y) %>%
arrange(ratio) %>%
print %>%
ggplot() +
geom_col(aes_string(col_name, "train_cnt"), fill='red', alpha = 0.5) +
geom_col(aes_string(col_name, "test_cnt"), fill='blue', alpha = 0.5) +
coord_flip() +
labs(x = x, y= y)+
readable_labs
}Training set had more records from my library compared to test.
test_train_plot(train, test, col_name = "source_system_tab", 'Source system tab', 'Test/Train record Count')## # A tibble: 10 x 4
## source_system_tab test_cnt train_cnt ratio
## <chr> <int> <int> <dbl>
## 1 my library 1019492 3684730 0.2766802
## 2 5096 18371 0.2773937
## 3 settings 633 2200 0.2877273
## 4 notification 2124 6185 0.3434115
## 5 explore 66023 167949 0.3931134
## # ... with 5 more rows
test_train_plot(train, test, col_name = "source_screen_name", "Source Screen Name", "Test/Train Count")## # A tibble: 23 x 4
## source_screen_name test_cnt train_cnt ratio
## <chr> <int> <int> <dbl>
## 1 Local playlist more 845115 3228202 0.2617912
## 2 Concert 13 47 0.2765957
## 3 My library_Search 2114 6451 0.3277011
## 4 Discover New 5277 15955 0.3307427
## 5 My library 25559 75980 0.3363912
## # ... with 18 more rows
## Warning: Removed 2 rows containing missing values (position_stack).
test_train_plot(train, test, col_name = "source_type", "Source Type", "Test/Train Count")## # A tibble: 13 x 4
## source_type test_cnt train_cnt ratio
## <chr> <int> <int> <dbl>
## 1 artist 428 3038 0.1408822
## 2 local-library 582346 2261399 0.2575158
## 3 local-playlist 294537 1079503 0.2728450
## 4 7297 21539 0.3387808
## 5 online-playlist 774532 1967924 0.3935782
## # ... with 8 more rows
Let’s create features in the song data frame that indicates the frequency of a particular item in the data set.
# <> is from magrittr package that is used for assiging it back the result
songs %<>%
left_join(artist_count, by='artist_name') %>%
left_join(lyricist_count, by='lyricist') %>%
left_join(composer_count, by='composer') %>%
left_join(language_count, by='language') %>%
rename(art_cnt = n.x, lyr_cnt = n.y, cmp_cnt = n.x.x, lng_cnt = n.y.y)Each song can be tagged with 1-8 genres. Lets create a feature that shows number og genres a song is tagged to as well as the frequency if each genre.
# Multiple Joins with a smaller data set is much cheaper than lookup
songs %<>%
add_column(no_of_genre = 1:dim(.)[1],
avg_genre_cnt = 1:dim(.)[1]) %>%
separate(genre_ids, c("one", "two", "three", "four", "five", "six", "seven", "eight"), extra="merge") %>%
left_join(genre_count, by = c("one" = "genre_ids")) %>%
left_join(genre_count, by = c("two" = "genre_ids"), suffix = c(".one", ".two")) %>%
left_join(genre_count, by = c("three" = "genre_ids")) %>%
left_join(genre_count, by = c("four" = "genre_ids"), suffix = c(".three", ".four")) %>%
left_join(genre_count, by = c("five" = "genre_ids")) %>%
left_join(genre_count, by = c("six" = "genre_ids"), suffix = c(".five", ".six")) %>%
left_join(genre_count, by = c("seven" = "genre_ids")) %>%
left_join(genre_count, by = c("eight" = "genre_ids"), suffix = c(".seven", ".eight")) ## Warning: Too few values at 2296309 locations: 1, 2, 3, 4, 5, 6, 7, 8, 9,
## 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...
songs %<>%
replace_na(list(n.one = 0, n.two = 0, n.three = 0, n.four = 0,
n.five = 0, n.six = 0, n.seven = 0, n.eight = 0)) %>%
mutate(no_of_genre = (if_else(n.one == 0, 0, 1) + if_else(n.two == 0, 0, 1) +
if_else(n.three == 0, 0, 1) + if_else(n.four == 0, 0, 1) +
if_else(n.five == 0, 0, 1) + if_else(n.six == 0, 0, 1) +
if_else(n.seven == 0, 0, 1) + if_else(n.eight == 0, 0, 1)),
avg_genre_cnt = (n.one + n.two + n.three + n.four +
n.five + n.six + n.seven + n.eight)/no_of_genre) %>%
select(song_id, song_length, language, art_cnt:lng_cnt, no_of_genre, one, n.one, avg_genre_cnt)count_frame <- function(df, col_name, new_name)
{
return(df %>%
group_by_(col_name) %>%
count %>%
rename_(.dots=setNames('n', new_name)))
}train_song_cnt <- count_frame(train, 'song_id', 'song_cnt')
train_sst <- count_frame(train, 'source_system_tab', 'sst_cnt')
train_ssn <- count_frame(train, 'source_screen_name', 'ssn_cnt')
train_st <- count_frame(train, 'source_type', 'st_cnt')# Reducing the number of categories into four categories based on interest (approximation)
# 0 - high interest - local and search
# 1 - random on internet
# 2 - random
# 3 - social
train %<>%
mutate(sst = ifelse((source_system_tab %in% c('my library', 'search')), 0,
ifelse((source_system_tab %in% c('discover', 'explore', 'radio')), 1,
ifelse((source_system_tab %in% c('null', '', 'notification', 'settings')), 2, 3)))) %>%
mutate(ssn = ifelse((source_screen_name %in% c('Payment', 'My library', 'My library_Search',
'Local playlist more', 'Search')), 0,
ifelse((source_screen_name %in% c('Album more', 'Artist more', 'Concert', 'Discover Chart',
'Discover Feature', 'Discover Genre', 'Discover New',
'Explore', 'Radio')), 1,
ifelse((source_screen_name %in% c('People global', 'People local', 'Search Home',
'Search Trends', ' Self Profile more')), 2, 3)))) %>%
mutate(st = ifelse((source_type %in% c('local-library', 'local-playlist')), 0,
ifelse((source_type %in% c('artist', 'album', 'my-daily-playlist',
'online-playlist', 'radio', 'song-based-playlist',
'top-hits-for-artist', 'topic-article-playlist', 'song')), 1, 2))) target_vs_column(train, col_name = "st",
x = 'Frequency',
y = 'Target',
title = 'Count of source_system_tab vs Target')target_vs_column(train, col_name = "ssn",
x = 'Frequency',
y = 'Target',
title = 'Count of source_system_tab vs Target')target_vs_column(train, col_name = "sst",
x = 'Frequency',
y = 'Target',
title = 'Count of source_system_tab vs Target')#cormat <- round(cor(train),2)# %>%
#gather(key = ')